# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Analysis of intermarriage rates by level of urbanization.

library(tidyverse)
library(here)
library(ggpubr)

weighted_se <- function(x, w, na.rm = TRUE) {
    if (na.rm) {
        complete_cases <- !is.na(x) & !is.na(w)
        x <- x[complete_cases]
        w <- w[complete_cases]
    }
    n <- length(x)
    if (n <= 1) return(NA)
    
    weighted_mean <- sum(w * x) / sum(w)
    weighted_var <- sum(w * (x - weighted_mean)^2) / sum(w)
    effective_n <- (sum(w))^2 / sum(w^2)

    sqrt(weighted_var / effective_n)
}

marriages <- read_csv(here("data", "clean", "sample.csv")) %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(Intergroup = Ethnicity_Head != Ethnicity_Spouse) %>%
  mutate(Interblock = EthnicBlock_Head != EthnicBlock_Spouse) %>%
  filter(FullReside == T) %>%
  filter(Polygamous_Head == "No more than one wife linked via SPLOC")

hhi_by_level <- read_csv(here("out", "inter", "hhi_by_level.csv"))
hhi_by_decade <- read_csv(here("out", "inter", "hhi_by_decade.csv"))
hhi_by_const_decade <- read_csv(here("out", "inter", "hhi_by_const_decade.csv"))
n_gc_ethnicity <- read_csv(here("out", "inter", "n_gc_ethnicity.csv"))
n_gc_ethnic_block <- read_csv(here("out", "inter", "n_gc_ethnic_block.csv"))

r_gc_ethnicity <- marriages %>%
  filter(!is.na(Ethnicity_Head) & !is.na(Const) & !is.na(DecadeMarried)) %>%
  group_by(Ethnicity_Head, Const, District, DecadeMarried) %>%
  summarise(
    r_gc = mean(Intergroup, na.rm = TRUE),
    n_marriages_gc = n(),
    .groups = "drop"
  )

r_gc_ethnic_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & !is.na(Const) & !is.na(DecadeMarried)) %>%
  group_by(EthnicBlock_Head, Const, District, DecadeMarried) %>%
  summarise(
    r_gc = mean(Interblock, na.rm = TRUE),
    n_marriages_gc = n(),
    .groups = "drop"
  )

observed_intermarriage_group <- r_gc_ethnicity %>%
  inner_join(
    n_gc_ethnicity,
    by = c("Ethnicity_Head" = "Ethnicity", "Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    observed_rate_group = weighted.mean(r_gc, n_gc, na.rm = TRUE),
    total_marriages = sum(n_marriages_gc, na.rm = TRUE),
    .groups = "drop"
  )

observed_intermarriage_block <- r_gc_ethnic_block %>%
  inner_join(
    n_gc_ethnic_block,
    by = c("EthnicBlock_Head" = "EthnicBlock", "Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    observed_rate_block = weighted.mean(r_gc, n_gc, na.rm = TRUE),
    total_marriages = sum(n_marriages_gc, na.rm = TRUE),
    .groups = "drop"
  )

n_c_ethnicity <- n_gc_ethnicity %>%
  group_by(Const, District, Decade) %>%
  summarise(n_c = sum(n_gc, na.rm = TRUE), .groups = "drop")

p_gc_ethnicity <- n_gc_ethnicity %>%
  inner_join(n_c_ethnicity, by = c("Const", "District", "Decade")) %>%
  mutate(p_gc = n_gc / n_c)

combined_data_group <- r_gc_ethnicity %>%
  inner_join(
    p_gc_ethnicity,
    by = c("Ethnicity_Head" = "Ethnicity", "Const", "District", "DecadeMarried" = "Decade")
  )

I_group_share_ethnicity <- combined_data_group %>%
  filter(p_gc < 1) %>%
  mutate(
    component = (r_gc / (1 - p_gc)) * p_gc,
    expectation = 1 - p_gc
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    I_group_share = sum(component, na.rm = TRUE),
    avg_observed_rate = weighted.mean(r_gc, w = n_gc, na.rm = TRUE),
    avg_expectation = weighted.mean(expectation, w = n_gc, na.rm = TRUE),
    total_n_group = sum(n_gc, na.rm = TRUE),
    .groups = "drop"
  )

n_c_ethnic_block <- n_gc_ethnic_block %>%
  group_by(Const, District, Decade) %>%
  summarise(n_c = sum(n_gc, na.rm = TRUE), .groups = "drop")

p_gc_ethnic_block <- n_gc_ethnic_block %>%
  inner_join(n_c_ethnic_block, by = c("Const", "District", "Decade")) %>%
  mutate(p_gc = n_gc / n_c)

combined_data_block <- r_gc_ethnic_block %>%
  inner_join(
    p_gc_ethnic_block,
    by = c("EthnicBlock_Head" = "EthnicBlock", "Const", "District", "DecadeMarried" = "Decade")
  )

I_group_share_block <- combined_data_block %>%
  filter(p_gc < 1) %>%
  mutate(
    component = (r_gc / (1 - p_gc)) * p_gc,
    expectation = 1 - p_gc
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    I_group_share = sum(component, na.rm = TRUE),
    avg_observed_rate = weighted.mean(r_gc, w = n_gc, na.rm = TRUE),
    avg_expectation = weighted.mean(expectation, w = n_gc, na.rm = TRUE),
    total_n_block = sum(n_gc, na.rm = TRUE),
    .groups = "drop"
  )

urbanization_lookup <- marriages %>%
  select(Const, District, DecadeMarried, Urbanization) %>%
  distinct() %>%
  mutate(
    UrbanBin = cut(Urbanization, 
                   breaks = seq(0, 1, by = 0.1), 
                   labels = c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%", 
                              "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"),
                   include.lowest = TRUE)
  )

aggregate_by_urban_bin <- function(data, weight_col, type_label) {
  data %>%
    left_join(urbanization_lookup, by = c("Const", "District", "DecadeMarried")) %>%
    filter(!is.na(UrbanBin)) %>%
    group_by(UrbanBin) %>%
    summarise(
      I_group_share_national = weighted.mean(I_group_share, w = !!sym(weight_col), na.rm = TRUE),
      I_group_share_national_se = weighted_se(I_group_share, w = !!sym(weight_col), na.rm = TRUE),
      avg_observed_rate_national = weighted.mean(avg_observed_rate, w = !!sym(weight_col), na.rm = TRUE),
      avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = !!sym(weight_col), na.rm = TRUE),
      total_n = sum(!!sym(weight_col), na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(Group_Type = type_label)
}

results_block <- aggregate_by_urban_bin(I_group_share_block, "total_n_block", "Ethnic Block")
results_group <- aggregate_by_urban_bin(I_group_share_ethnicity, "total_n_group", "Ethnic Group")

urbanization_results <- bind_rows(results_block, results_group)

if (!dir.exists(here("out", "final"))) {
  dir.create(here("out", "final"), recursive = TRUE)
}

write_csv(urbanization_results, here("out", "final", "appendix_urbanization.csv"))

plot_urbanization <- function(data, title_text) {
  data_long <- data %>%
    select(UrbanBin, Observed = avg_observed_rate_national, Adjusted = I_group_share_national, 
           Observed_SE = avg_observed_rate_national_se, Adjusted_SE = I_group_share_national_se) %>%
    pivot_longer(cols = c(Observed, Adjusted), names_to = "Measure", values_to = "Rate") %>%
    mutate(
      SE = if_else(Measure == "Observed", Observed_SE, Adjusted_SE),
      ymin = Rate - 1.96 * SE,
      ymax = Rate + 1.96 * SE
    )

  ggplot(data_long, aes(x = UrbanBin, y = Rate)) +
    geom_bar(stat = "identity", fill = "gray70") +
    geom_errorbar(aes(ymin = ymin, ymax = ymax), width = 0.25) +
    theme_pubclean() +
    labs(
      title = title_text,
      x = "Urbanization Level",
      y = "Intermarriage Rate"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    facet_wrap(~ Measure)
}

figA1 <- plot_urbanization(results_block, "Intermarriage Rates by Level of Urbanization (Ethnic Block)")
ggsave(here("figures", "Figure_A1.png"), figA1, width = 8, height = 5, dpi = 300)

figA2 <- plot_urbanization(results_group, "Intermarriage Rates by Level of Urbanization (Ethnic Group)")
ggsave(here("figures", "Figure_A2.png"), figA2, width = 8, height = 5, dpi = 300)

cat("\nAnalysis complete. Figures saved to 'figures/' and data to 'out/final/appendix_urbanization.csv'.\n")
